////////////////////////////////////////////////////////////////////////////////

* This do-file cleans the Scopus data on social science publications and combines it with the output of the natural language processing (NLP) program, run in Python, on the Scopus papers about job (in)security.

////////////////////////////////////////////////////////////////////////////////

* All relative paths below resolve against the directory held in the global macro otherdata.
cd "$otherdata"

* Step 1: model predictions. Read column B (first row of the range is the header row)
* and tag every observation with its row position.
import excel "raw/predictions.xlsx", sheet("Sheet1") cellrange(B1:B3923) firstrow clear
rename B prediction
generate n = _n
tempfile pred_rows
save `pred_rows'

* Step 2: paper IDs. Same cell range as the predictions sheet, same row counter.
* NOTE(review): pairing by n assumes both sheets list the papers in the same order - confirm.
* NOTE(review): the later merge on id assumes the header in ids.xlsx produces a variable named id - confirm.
import excel "raw/ids.xlsx", sheet("Sheet1") cellrange(B1:B3923) firstrow clear
generate n = _n

* Step 3: attach each prediction to its ID by row position, then discard the counter.
merge 1:1 n using `pred_rows', nogenerate
drop n

* Store the ID-prediction pairs for the merge on id further down.
tempfile predictions
save `predictions'

* Hand classifications: spreadsheet columns B and D become id and classification;
* all other imported columns are discarded.
* NOTE(review): the range starts at row 2 while firstrow is set, so row 2 is read as the
* header row rather than as data - confirm that row 2 does not hold an observation.
import excel "raw/abstracts_intros_classified.xlsx", sheet("Sheet1") cellrange(A2:D3923) firstrow clear
rename B id
rename D classification
keep id classification

* Store for the merge on id further down.
tempfile classes
save `classes'


* Publication dates: one row per paper with its Scopus ID and cover date.
import excel "raw/dates.xlsx", sheet("Sheet1") cellrange(A1:B3927) firstrow clear
rename scopus_id id
rename cover_date date

* Convert the ID to numeric so it can serve as the merge key.
destring id, replace

* The year is the first four characters of the date string, converted to numeric.
generate year = substr(date, 1, 4)
destring year, replace

* Only the key and the year are needed from this sheet.
keep id year
	
* Attach the model predictions; retain only papers present in both datasets.
merge 1:1 id using `predictions'
drop if _merge != 3
drop _merge

* Attach the hand classifications; again retain only papers present in both datasets.
merge 1:1 id using `classes'
drop if _merge != 3
drop _merge

* Where a hand classification exists and disagrees with the model, the hand classification wins.
replace prediction = classification if prediction != classification & !missing(classification)

* Variable labels for the final dataset.
label variable id "Scopus ID"
label variable year "Year"
label variable prediction "NLP model prediction"
label variable classification "Hand classification"

* Write the cleaned dataset, overwriting any existing copy.
save "clean/nlp.dta", replace
